
# basic setup
rm(list=ls())
library(readstata13)
library(gbm)
set.seed(1234)

# read in data saved as a Stata 13 dta file
df <- read.dta13("ml_v5.dta")

# format data
df$court <- as.factor(df$court)
df$baildow <- as.factor(df$baildow)
df$bailmonth <- as.factor(df$bailmonth)
df$bailshift <- as.factor(df$bailshift)
df$bailyear <- as.factor(df$bailyear)

# create pooled training set
train.pooled <- sample(1:nrow(df), floor(nrow(df) * 0.8))
df$ml.train.pooled <- 1
df[-train.pooled,]$ml.train.pooled <- 0

# subset pooled training set for gbm on those who were released
train.pooled.gbm <- which(df$ml.train.pooled==1 & df$bail_met_ever==1)

# classification (rearrest)
ptm <- proc.time()
boost.df.mp <- gbm(recid_arrest_onbail ~ . ,
                   data = df[train.pooled.gbm, c(6, 10:(ncol(df) - 1))],
                   distribution = "bernoulli",
                   n.trees = 8000,
                   interaction.depth = 5,
                   shrinkage = .005,
                   cv.folds = 5,
                   verbose = FALSE)
proc.time() - ptm

# best number of trees from cv estimate
par(mfrow = c(1, 1))
best.n.mp <- gbm.perf(boost.df.mp, method = "cv")
best.n.mp

# show summary of most important variables
summary.gbm(boost.df.mp)

# apply prediction to full sample
df$recid_arrest_onbail_yhat_mp <- predict(boost.df.mp,
                                          newdata = df,
                                          n.trees = best.n.mp,
                                          type = "response")
df$recid_arrest_onbail_mp <- ifelse(df$recid_arrest_onbail_yhat_mp > 0.5, 1, 0)

# classification (drug rearrest)
ptm <- proc.time()
boost.df.dp <- gbm(recid_arrest_onbail_drug ~ . ,
                   data = df[train.pooled.gbm, c(7, 10:(ncol(df) - 3))],
                   distribution = "bernoulli",
                   n.trees = 8000,
                   interaction.depth = 5,
                   shrinkage = .005,
                   cv.folds = 5,
                   verbose = FALSE)
proc.time() - ptm

# best number of trees from cv estimate
par(mfrow = c(1, 1))
best.n.dp <- gbm.perf(boost.df.dp, method = "cv")
best.n.dp

# show summary of most important variables
summary.gbm(boost.df.dp)

# apply prediction to full sample
df$recid_arrest_onbail_yhat_dp <- predict(boost.df.dp,
                                          newdata = df,
                                          n.trees = best.n.dp,
                                          type = "response")
df$recid_arrest_onbail_dp <- ifelse(df$recid_arrest_onbail_yhat_dp > 0.5, 1, 0)

# classification (property rearrest)
ptm <- proc.time()
boost.df.pp <- gbm(recid_arrest_onbail_property ~ . ,
                   data = df[train.pooled.gbm, c(8, 10:(ncol(df) - 5))],
                   distribution = "bernoulli",
                   n.trees = 8000,
                   interaction.depth = 5,
                   shrinkage = .005,
                   cv.folds = 5,
                   verbose = FALSE)
proc.time() - ptm

# best number of trees from cv estimate
par(mfrow = c(1, 1))
best.n.pp <- gbm.perf(boost.df.pp, method = "cv")
best.n.pp

# show summary of most important variables
summary.gbm(boost.df.pp)

# apply prediction to full sample
df$recid_arrest_onbail_yhat_pp <- predict(boost.df.pp,
                                          newdata = df,
                                          n.trees = best.n.pp,
                                          type = "response")
df$recid_arrest_onbail_pp <- ifelse(df$recid_arrest_onbail_yhat_pp > 0.5, 1, 0)

# classification (violent rearrest)
ptm <- proc.time()
boost.df.vp <- gbm(recid_arrest_onbail_violent ~ . ,
                   data = df[train.pooled.gbm, c(9, 10:(ncol(df) - 7))],
                   distribution = "bernoulli",
                   n.trees = 8000,
                   interaction.depth = 5,
                   shrinkage = .005,
                   cv.folds = 5,
                   verbose = FALSE)
proc.time() - ptm

# best number of trees from cv estimate
par(mfrow = c(1, 1))
best.n.vp <- gbm.perf(boost.df.vp, method = "cv")
best.n.vp

# show summary of most important variables
summary.gbm(boost.df.vp)

# apply prediction to full sample
df$recid_arrest_onbail_yhat_vp <- predict(boost.df.vp,
                                          newdata = df,
                                          n.trees = best.n.vp,
                                          type = "response")
df$recid_arrest_onbail_vp <- ifelse(df$recid_arrest_onbail_yhat_vp > 0.5, 1, 0)


# subset training set for gbm on those with lenient judges who were released
train.pooled.gbm.2 <- which(df$bail_met_ever==1 & df$flag_lenient==1)

# classification
ptm <- proc.time()
boost.df.ml <- gbm(recid_arrest_onbail ~ . ,
                   data = df[train.pooled.gbm.2, c(6, 10:60)],
                   distribution = "bernoulli",
                   n.trees = 8000,
                   interaction.depth = 5,
                   shrinkage = .005,
                   cv.folds = 5,
                   verbose = FALSE)
proc.time() - ptm

# best number of trees from cv estimate
par(mfrow = c(1, 1))
best.n.ml <- gbm.perf(boost.df.ml, method = "cv")
best.n.ml

# show summary of most important variables
summary.gbm(boost.df.ml)

# apply prediction to full sample
df$recid_arrest_onbail_yhat_ml <- predict(boost.df.ml,
                                          newdata = df,
                                          n.trees = best.n.ml,
                                          type = "response")
df$recid_arrest_onbail_ml <- ifelse(df$recid_arrest_onbail_yhat_ml > 0.5, 1, 0)

# collect predictions
ml.predict <- data.frame(df[,c(1,(ncol(df)-10):ncol(df))])

# export predictions to dta
save.dta13(ml.predict, "ml_predict_final.dta", convert.underscore = TRUE)

  